import pandas as pd
import numpy as np
# NOTE(review): plot_confusion_matrix was removed from sklearn.metrics in
# scikit-learn 1.2 (it is imported but not used in this notebook) — confirm the
# pinned scikit-learn version.
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report, plot_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
# preprocessing
from sklearn.preprocessing import StandardScaler
# keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
# NOTE(review): keras.utils.vis_utils moved in newer Keras/TF releases
# (tensorflow.keras.utils.plot_model) — confirm the pinned Keras version.
from keras.utils.vis_utils import plot_model
import keras.backend as K
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex
## seaborn
import seaborn as sns
# Global seaborn theme: paper context with slightly enlarged titles/labels.
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("white")
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
# NOTE(review): the 'seaborn-whitegrid' style name was renamed to
# 'seaborn-v0_8-whitegrid' in matplotlib 3.6 — confirm the pinned version.
plt.style.use('seaborn-whitegrid')
# Default figure geometry and font sizes for all matplotlib plots below.
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
import warnings
# Silence library deprecation noise in the notebook output.
warnings.filterwarnings("ignore")
Anomaly detection is a classification process in which rare items, events, or observations in a data set are identified; you can learn more about it here. In this article, we investigate the Credit Card Fraud Detection dataset from Kaggle.com.
Credit card companies must be able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.
The dataset contains transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions. It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA; the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.
# Load the raw transactions.
Data = pd.read_csv('Data/creditcard.csv', sep=',')
# Zero-pad the numeric suffix of the PCA feature names ('V1' -> 'V01') so the
# columns sort lexicographically in numeric order; non-numeric column names
# ('Time', 'Amount', 'Class') are kept as-is.
Data.columns = [
    'V' + name.split('V')[-1].zfill(2) if any(ch.isdigit() for ch in name) else name
    for name in Data.columns
]
# Show the dataset dimensions (attributes x instances).
display(pd.DataFrame(Data.shape, columns = ['Count'], index = ['Attributes', 'Instances']).T)
Labels = ['Normal', 'Fraud']
# Per-class transaction counts plus each class's share of the whole dataset.
Temp = Data['Class'].value_counts(sort = False).to_frame('Count').reset_index()
Temp.columns = ['Class','Count']
Temp['Class'] = Temp['Class'].map(lambda x: Labels[0] if x == 0 else Labels[1])
Temp['Percentage'] = np.round(100* Temp['Count'].values /Temp['Count'].sum(), 2)
display(Temp.style.hide_index().set_precision(2))
# Horizontal bar chart of the class imbalance.
fig = px.bar(Temp, y= 'Class', x= 'Percentage', orientation='h', text = 'Count',
             color_discrete_sequence= ['Bisque'], height= 220)
fig.update_traces(marker_line_color= 'DarkRed', marker_line_width=1.5, opacity=1,
                  texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide',
                  title = 'Transaction Class Distribution', plot_bgcolor= 'white')
fig.update_xaxes(range=[0, 100])
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
As can be seen, nearly 99.83 percent of the dataset is labeled as Normal.
# Overlaid 'Amount' histograms per class; log y-scale because normal
# transactions outnumber frauds by roughly 580:1.
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
_ = ax.hist(Data.loc[Data.Class == 0, 'Amount'], 100, color = '#34495e', hatch = '/', lw = 1.5,
            edgecolor = '#3498db', label = Labels[0])
# BUG FIX: the keyword was spelled 'Color' (capital C), which is not a valid
# hist/Patch property name and raises an error in matplotlib; it must be 'color'.
_ = ax.hist(Data.loc[Data.Class == 1, 'Amount'], 10, color = '#e74c3c', hatch = '\\', lw = 1.5,
            edgecolor = 'DarkRed', label = Labels[1])
_ = ax.set_xlabel('Amount')
_ = ax.set_ylabel('Number of Transactions')
_ = ax.set_xlim([0, 2e4])
_ = ax.set_yscale('log')
# NOTE(review): a 0 lower limit is not representable on a log axis; matplotlib
# ignores the non-positive bottom — kept as-is to preserve the original view.
_ = ax.set_ylim([0, 1e6])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize=14, ncol=2)
# Transaction amount over time; fraud points are drawn second so they sit on top.
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
scatter_specs = [
    (0, 'SkyBlue', 'MidnightBlue', 0.8, Labels[0]),
    (1, 'Orange', 'DarkRed', 1, Labels[1]),
]
for cls, face, edge, alpha, label in scatter_specs:
    subset = Data.loc[Data.Class == cls]
    _ = ax.scatter(subset['Time'], subset['Amount'], s= 30, facecolors=face,
                   edgecolors=edge, alpha = alpha, label = label)
_ = ax.set_xlabel('Time (in seconds)')
_ = ax.set_ylabel('Amount')
_ = ax.set_xlim([-500, Data.Time.max()+500])
_ = ax.set_ylim([-250, 2e4])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize=14, ncol=2)
The dataset is quite large, so we use pandas' DataFrame.sample method to work with one-tenth of the data as a sample.
# Work on a reproducible 10% random sample (fixed seed) to keep the analysis fast.
df= Data.sample(frac = 0.1, random_state=1)
def Data_info(Inp, Only_NaN = False):
    """Summarize a DataFrame's columns: dtype, NaN count, row count, NaN percentage.

    Inp      : pandas DataFrame to inspect.
    Only_NaN : when True, keep only the columns that contain at least one NaN.
    Returns a DataFrame indexed by column name, sorted by dtype.
    """
    row_count = Inp.shape[0]
    nan_counts = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    summary = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    summary = summary.join(nan_counts, how='outer')
    summary['Size'] = row_count
    summary['Percentage'] = np.round(100 * (summary['Number of NaN Values'] / row_count), 2)
    if Only_NaN:
        return summary.loc[summary['Number of NaN Values'] > 0]
    return summary
# Preview the sampled data and its dtype / missing-value summary.
display(df.head())
Data_info(df)
First off, let's define $X$ and $y$ sets.
# Feature matrix X (all columns except the response) and target vector y.
Target = 'Class'
X = df.drop(columns = [Target])
y = df[Target]
Now, let's take a look at the variance of the features.
# Feature variances in descending order, to motivate standardization.
# NOTE(review): Styler.set_precision was deprecated in newer pandas
# (use .format(precision=...)) — confirm the pinned pandas version.
display(X.var().sort_values(ascending = False).to_frame(name= 'Variance').T.style.set_precision(2))
As can be seen, some of the variables have high variance, which is not desirable for our modeling. Thus, we would like to standardize the features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().
# Standardize features to zero mean / unit variance; note this turns X into a numpy array.
X = StandardScaler().fit_transform(X)
Correlations of features with Class.
# Rebuild a DataFrame from the scaled array (fit_transform dropped the column
# labels) and re-attach the target for the correlation plot.
Temp = pd.DataFrame(X, columns = df.drop(columns = [Target]).columns)
Temp[Target] = y
def Correlation_Plot (Df,Fig_Size):
    """Draw a lower-triangle correlation heatmap of Df.

    Df       : DataFrame whose pairwise column correlations are plotted.
    Fig_Size : side length (inches) of the square figure.
    The upper triangle is masked out; the diagonal is kept visible.
    """
    Correlation_Matrix = Df.corr().round(2)
    # Boolean mask over the upper triangle. The original built a float mask and
    # re-zeroed the diagonal with a Python loop; np.fill_diagonal does that in
    # one vectorized call.
    mask = np.zeros_like(Correlation_Matrix, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    np.fill_diagonal(mask, False)
    Fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap =sns.color_palette("Greens", n_colors=10), linewidths = 0.2,
                vmin=0, vmax=1, cbar_kws={"shrink": .6})
Correlation_Plot (Temp, 16)
# One-hot encode the binary target (two columns) so the two-unit output layer
# can be trained with categorical cross-entropy.
y = pd.get_dummies(df[Target]).astype(int)
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Show the resulting set shapes side by side.
pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                   'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Here, we implement an artificial neural network (ANN) using Keras Sequential model.
# Feed-forward ANN: input -> 16 -> 8 -> one-hot output.
model = Sequential()
# FIX: 'init=' and 'nb_epoch=' are the removed Keras 1 argument spellings; the
# Keras 2 names are 'kernel_initializer=' and 'epochs='. The notebook already
# reads history['accuracy'] below, which only exists in Keras >= 2.3, so the
# modern names are the consistent choice.
model.add(Dense(16, input_dim= X.shape[1], kernel_initializer='uniform', activation='sigmoid', name="Hidden_Layer_01"))
model.add(Dense(8, kernel_initializer='uniform', activation='sigmoid', name="Hidden_Layer_02"))
# NOTE(review): 'sigmoid' on a one-hot output trained with categorical
# cross-entropy is unusual — 'softmax' is the conventional pairing. Left
# unchanged to preserve the published results; confirm intent.
model.add(Dense(y.shape[1], kernel_initializer='uniform', activation='sigmoid', name="Output_Layer"))
# Number of iterations
IT = 121
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy','mae', 'mse'])
# Train model
history = model.fit(X_train, y_train, epochs= IT, batch_size=10, verbose=0)
# Predications and Score
y_pred = model.predict(X_test)
score = model.evaluate(X_test, y_test)
score = pd.DataFrame(score, index = model.metrics_names).T
history = pd.DataFrame(history.history)
display(score.style.hide_index())
# Training-history curves: loss plus the three tracked metrics, one trace each.
fig = go.Figure()
trace_specs = [
    ('loss', 'OrangeRed', 'Loss'),
    ('accuracy', 'MidnightBlue', 'Accuracy'),
    ('mae', 'ForestGreen', 'Mean Absolute Error (MAE)'),
    ('mse', 'purple', 'Mean Squared Error (MSE)'),
]
for column, color, trace_name in trace_specs:
    fig.add_trace(go.Scatter(x= history.index.values, y= history[column].values,
                             line=dict(color=color, width= 1.5), name = trace_name))
fig.update_layout(legend=dict(y=0.5, traceorder='reversed', font_size=12),
                  dragmode='select', plot_bgcolor= 'white', height=600, hovermode='closest')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray',
                 showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_xaxes(range=[0, history.index.values.max()])
fig.show()
# Render the layer graph (NOTE(review): needs pydot/graphviz installed — confirm).
plot_model(model, show_shapes=True, show_layer_names=True, expand_nested = True)
Running the following code produces a diagram of our model.
# Draw the network with ann_visualizer (writes 'Model01.pdf'), then convert the
# PDF to JPEG with pdf2image.
# NOTE(review): both packages are extra third-party dependencies on top of the
# imports at the top of the file.
from ann_visualizer.visualize import ann_viz
from pdf2image import convert_from_path
ann_viz(model, filename = 'Model01',title="The Model");
for Img in convert_from_path('Model01.pdf'):
    Img.save('Model01.jpg', 'JPEG')

Next, we can plot confusion matrix for our classifier.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# Collapse the one-hot columns back to single class labels for sklearn's
# confusion_matrix (predictions are rounded to the nearest 0/1 first).
true_labels = y_test.idxmax(axis=1)
pred_labels = pd.DataFrame(np.round(y_pred), columns = y_test.columns).astype(int).idxmax(axis=1)
Confusion_Matrix = confusion_matrix(true_labels, pred_labels)
# Left panel: raw counts; right panel: row-normalized (per-true-class rates).
Normalized = Confusion_Matrix.astype('float') / Confusion_Matrix.sum(axis=1)[:, np.newaxis]
panels = [(Confusion_Matrix, 'Confusion Matrix'),
          (Normalized, 'Normalized Confusion Matrix')]
for idx, (matrix, title) in enumerate(panels):
    _ = sns.heatmap(matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[idx])
    _ = ax[idx].set_xlabel('Predicted labels')
    _ = ax[idx].set_ylabel('True labels')
    _ = ax[idx].set_title(title)
    _ = ax[idx].xaxis.set_ticklabels(Labels)
    _ = ax[idx].yaxis.set_ticklabels(Labels)